Initial Data Exploration
import numpy as np
import pandas as pd
# Display floats with a thousands separator and two decimal places, e.g. 2.15
pd.options.display.float_format = '{:,.2f}'.format
# Absolute path of the raw apps.csv export that is loaded below.
apps_csv_path = r"C:\Users\4ben\Downloads\Google+Play+Store+Project+(Start)\apps.csv"
df_apps = pd.read_csv(apps_csv_path)
df_apps
| App | Category | Rating | Reviews | Size_MBs | Installs | Type | Price | Content_Rating | Genres | Last_Updated | Android_Ver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Ak Parti Yardım Toplama | SOCIAL | NaN | 0 | 8.70 | 0 | Paid | $13.99 | Teen | Social | July 28, 2017 | 4.1 and up |
| 1 | Ain Arabic Kids Alif Ba ta | FAMILY | NaN | 0 | 33.00 | 0 | Paid | $2.99 | Everyone | Education | April 15, 2016 | 3.0 and up |
| 2 | Popsicle Launcher for Android P 9.0 launcher | PERSONALIZATION | NaN | 0 | 5.50 | 0 | Paid | $1.49 | Everyone | Personalization | July 11, 2018 | 4.2 and up |
| 3 | Command & Conquer: Rivals | FAMILY | NaN | 0 | 19.00 | 0 | NaN | 0 | Everyone 10+ | Strategy | June 28, 2018 | Varies with device |
| 4 | CX Network | BUSINESS | NaN | 0 | 10.00 | 0 | Free | 0 | Everyone | Business | August 6, 2018 | 4.1 and up |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10836 | Subway Surfers | GAME | 4.50 | 27723193 | 76.00 | 1,000,000,000 | Free | 0 | Everyone 10+ | Arcade | July 12, 2018 | 4.1 and up |
| 10837 | Subway Surfers | GAME | 4.50 | 27724094 | 76.00 | 1,000,000,000 | Free | 0 | Everyone 10+ | Arcade | July 12, 2018 | 4.1 and up |
| 10838 | Subway Surfers | GAME | 4.50 | 27725352 | 76.00 | 1,000,000,000 | Free | 0 | Everyone 10+ | Arcade | July 12, 2018 | 4.1 and up |
| 10839 | Subway Surfers | GAME | 4.50 | 27725352 | 76.00 | 1,000,000,000 | Free | 0 | Everyone 10+ | Arcade | July 12, 2018 | 4.1 and up |
| 10840 | Subway Surfers | GAME | 4.50 | 27711703 | 76.00 | 1,000,000,000 | Free | 0 | Everyone 10+ | Arcade | July 12, 2018 | 4.1 and up |
10841 rows × 12 columns
#.shape returns the dimensions of the DataFrame as a (rows, columns) tuple.
#10841 rows and 12 columns.
df_apps.shape
(10841, 12)
#We can already see that some data cleansing is needed.
#The Rating and Type columns contain NaN (Not a Number) values, and the Price column has dollar signs in its entries, which will cause calculation errors.
# sample(n) method
#We can use sample to give us a number of rows
df_apps.sample(10)
| App | Category | Rating | Reviews | Size_MBs | Installs | Type | Price | Content_Rating | Genres | Last_Updated | Android_Ver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8571 | Real Madrid App | SPORTS | 4.60 | 161423 | 73.00 | 5,000,000 | Free | 0 | Everyone | Sports | August 1, 2018 | 5.0 and up |
| 3409 | Account Class-12 Solutions (D K Goel) Vol-2 | FAMILY | 4.60 | 124 | 23.00 | 10,000 | Free | 0 | Everyone | Education | April 17, 2018 | 3.0 and up |
| 36 | D+H Reaction Wall | GAME | NaN | 0 | 36.00 | 1 | Paid | $0.99 | Everyone | Arcade | July 24, 2016 | Varies with device |
| 3952 | Swift Dark CM / CM13 Theme | PERSONALIZATION | 4.70 | 1500 | 41.00 | 10,000 | Paid | $1.99 | Everyone | Personalization | January 5, 2017 | 5.0 and up |
| 4996 | FlashLight F.Light | TOOLS | 4.30 | 3943 | 11.00 | 100,000 | Free | 0 | Everyone | Tools | May 23, 2018 | 5.0 and up |
| 1339 | PCOS Guide - Fight PCOS naturally | MEDICAL | NaN | 0 | 5.10 | 100 | Free | 0 | Everyone | Medical | August 4, 2018 | 5.0 and up |
| 8350 | American Airlines | TRAVEL_AND_LOCAL | 3.70 | 16973 | 12.00 | 5,000,000 | Free | 0 | Everyone | Travel & Local | July 18, 2018 | Varies with device |
| 6007 | Pro App for Craigslist | SHOPPING | 4.10 | 5618 | 4.60 | 500,000 | Free | 0 | Teen | Shopping | May 9, 2018 | 4.1 and up |
| 3662 | Access Point Names | COMMUNICATION | 4.00 | 138 | 2.40 | 10,000 | Free | 0 | Everyone | Communication | August 12, 2014 | 2.3.3 and up |
| 5942 | Wheretoget: Shop in style | LIFESTYLE | 4.10 | 6808 | 12.00 | 500,000 | Free | 0 | Teen | Lifestyle | June 5, 2017 | 4.1 and up |
#Remove the unwanted columns 'Last_Updated' and 'Android_Ver' with the .drop() method.
#Passing the names through columns= is equivalent to passing them as labels together with axis=1.
#inplace=True modifies df_apps itself instead of returning a new DataFrame.
df_apps.drop(columns=["Last_Updated", "Android_Ver"], inplace=True)
df_apps.sample(10)
| App | Category | Rating | Reviews | Size_MBs | Installs | Type | Price | Content_Rating | Genres | |
|---|---|---|---|---|---|---|---|---|---|---|
| 6430 | Used car is the first car - used car purchase,... | AUTO_AND_VEHICLES | 4.60 | 5097 | 19.00 | 1,000,000 | Free | 0 | Everyone | Auto & Vehicles |
| 91 | 4 Paws PH | MEDICAL | NaN | 1 | 29.00 | 5 | Free | 0 | Everyone | Medical |
| 8331 | Univision Deportes: Liga MX, MLS, Fútbol Live | SPORTS | 4.20 | 75545 | 14.00 | 5,000,000 | Free | 0 | Everyone | Sports |
| 10155 | trivago: Hotels & Travel | TRAVEL_AND_LOCAL | 4.20 | 219848 | 12.00 | 50,000,000 | Free | 0 | Everyone | Travel & Local |
| 6507 | CZ File Manager | TOOLS | 2.20 | 876 | 8.70 | 1,000,000 | Free | 0 | Everyone | Tools |
| 8717 | Ginger Keyboard - Emoji, GIFs, Themes & Games | PRODUCTIVITY | 4.40 | 162831 | 43.00 | 5,000,000 | Free | 0 | Everyone | Productivity |
| 2594 | Sam.BN | TOOLS | NaN | 83 | 2.00 | 1,000 | Free | 0 | Everyone | Tools |
| 7227 | Car Driving Simulator Drift | GAME | 4.40 | 19816 | 57.00 | 1,000,000 | Free | 0 | Everyone | Racing |
| 7237 | Discovery K!ds Play! | FAMILY | 4.10 | 19388 | 35.00 | 1,000,000 | Free | 0 | Everyone | Entertainment;Music & Video |
| 2323 | BP-Tech | FAMILY | 3.20 | 12 | 13.00 | 1,000 | Free | 0 | Everyone | Casual |
#Drop every row that has a NaN in any column (axis=0 and how="any" are the pandas defaults, spelled out here).
df_apps_clean = df_apps.dropna(axis=0, how="any")
df_apps_clean.shape #9367 rows remain once the rows containing NaN values are removed.
(9367, 10)
#Double checking for any nan values and we see a "False" result which means there are no more NaN entries.
c = df_apps_clean.isna().values.any()
c
False
#Boolean mask that is True for every row that exactly repeats an earlier row.
is_repeat = df_apps_clean.duplicated()
duplicated_rows = df_apps_clean[is_repeat]
#print() is used because only the last expression of the cell (the head() call) is displayed automatically.
print(duplicated_rows.shape)
duplicated_rows.head(10)
(476, 10)
| App | Category | Rating | Reviews | Size_MBs | Installs | Type | Price | Content_Rating | Genres | |
|---|---|---|---|---|---|---|---|---|---|---|
| 946 | 420 BZ Budeze Delivery | MEDICAL | 5.00 | 2 | 11.00 | 100 | Free | 0 | Mature 17+ | Medical |
| 1133 | MouseMingle | DATING | 2.70 | 3 | 3.90 | 100 | Free | 0 | Mature 17+ | Dating |
| 1196 | Cardiac diagnosis (heart rate, arrhythmia) | MEDICAL | 4.40 | 8 | 6.50 | 100 | Paid | $12.99 | Everyone | Medical |
| 1231 | Sway Medical | MEDICAL | 5.00 | 3 | 22.00 | 100 | Free | 0 | Everyone | Medical |
| 1247 | Chat Kids - Chat Room For Kids | DATING | 4.70 | 6 | 4.90 | 100 | Free | 0 | Mature 17+ | Dating |
| 1379 | CT Scan Cross Sectional Anatomy | MEDICAL | 4.30 | 10 | 46.00 | 100 | Free | 0 | Everyone | Medical |
| 1616 | JH Blood Pressure Monitor | MEDICAL | 3.70 | 9 | 2.90 | 500 | Free | 0 | Everyone | Medical |
| 1642 | Cardi B Live Stream Video Chat - Prank | DATING | 4.40 | 28 | 3.40 | 500 | Free | 0 | Everyone | Dating |
| 1813 | Diabetes & Diet Tracker | MEDICAL | 4.60 | 395 | 19.00 | 1,000 | Paid | $9.99 | Everyone | Medical |
| 1821 | Transenger – Ts Dating and Chat for Free | DATING | 3.60 | 8 | 14.00 | 1,000 | Free | 0 | Mature 17+ | Dating |
duplicated_rows.shape
(476, 10)
duplicated_rows.head()
| App | Category | Rating | Reviews | Size_MBs | Installs | Type | Price | Content_Rating | Genres | |
|---|---|---|---|---|---|---|---|---|---|---|
| 946 | 420 BZ Budeze Delivery | MEDICAL | 5.00 | 2 | 11.00 | 100 | Free | 0 | Mature 17+ | Medical |
| 1133 | MouseMingle | DATING | 2.70 | 3 | 3.90 | 100 | Free | 0 | Mature 17+ | Dating |
| 1196 | Cardiac diagnosis (heart rate, arrhythmia) | MEDICAL | 4.40 | 8 | 6.50 | 100 | Paid | $12.99 | Everyone | Medical |
| 1231 | Sway Medical | MEDICAL | 5.00 | 3 | 22.00 | 100 | Free | 0 | Everyone | Medical |
| 1247 | Chat Kids - Chat Room For Kids | DATING | 4.70 | 6 | 4.90 | 100 | Free | 0 | Mature 17+ | Dating |
df_apps_clean.duplicated()
21 False
28 False
47 False
82 False
99 False
...
10836 False
10837 False
10838 False
10839 True
10840 False
Length: 9367, dtype: bool
df_apps_clean.duplicated().sum()
#476 duplicate rows
476
#Keep only the first row for each (App, Type, Price) combination. Comparing on these three
#columns also removes repeats of an app that differ only in other columns such as Reviews.
dedupe_keys = ["App", "Type", "Price"]
df_apps_clean = df_apps_clean.drop_duplicates(subset=dedupe_keys, keep="first")
df_apps_clean.shape
(8199, 10)
#Sort apps based on the highest rating
df_apps_clean.sort_values('Rating', ascending=False).head(9)
| App | Category | Rating | Reviews | Size_MBs | Installs | Type | Price | Content_Rating | Genres | |
|---|---|---|---|---|---|---|---|---|---|---|
| 21 | KBA-EZ Health Guide | MEDICAL | 5.00 | 4 | 25.00 | 1 | Free | 0 | Everyone | Medical |
| 1230 | Sway Medical | MEDICAL | 5.00 | 3 | 22.00 | 100 | Free | 0 | Everyone | Medical |
| 1227 | AJ Men's Grooming | LIFESTYLE | 5.00 | 2 | 22.00 | 100 | Free | 0 | Everyone | Lifestyle |
| 1224 | FK Dedinje BGD | SPORTS | 5.00 | 36 | 2.60 | 100 | Free | 0 | Everyone | Sports |
| 1223 | CB VIDEO VISION | PHOTOGRAPHY | 5.00 | 13 | 2.60 | 100 | Free | 0 | Everyone | Photography |
| 1222 | Beacon Baptist Jupiter, FL | LIFESTYLE | 5.00 | 14 | 2.60 | 100 | Free | 0 | Everyone | Lifestyle |
| 1214 | BV Mobile Apps | PRODUCTIVITY | 5.00 | 3 | 4.80 | 100 | Free | 0 | Everyone | Productivity |
| 2680 | Florida Wildflowers | FAMILY | 5.00 | 5 | 69.00 | 1,000 | Free | 0 | Everyone | Education |
| 1206 | ADS-B Driver | TOOLS | 5.00 | 2 | 6.30 | 100 | Paid | $1.99 | Everyone | Tools |
#Sort apps based on the Size_MBs (Highest to Lowest)
df_apps_clean.sort_values('Size_MBs', ascending=False).head(9)
| App | Category | Rating | Reviews | Size_MBs | Installs | Type | Price | Content_Rating | Genres | |
|---|---|---|---|---|---|---|---|---|---|---|
| 9942 | Talking Babsy Baby: Baby Games | LIFESTYLE | 4.00 | 140995 | 100.00 | 10,000,000 | Free | 0 | Everyone | Lifestyle;Pretend Play |
| 10687 | Hungry Shark Evolution | GAME | 4.50 | 6074334 | 100.00 | 100,000,000 | Free | 0 | Teen | Arcade |
| 9943 | Miami crime simulator | GAME | 4.00 | 254518 | 100.00 | 10,000,000 | Free | 0 | Mature 17+ | Action |
| 9944 | Gangster Town: Vice District | FAMILY | 4.30 | 65146 | 100.00 | 10,000,000 | Free | 0 | Mature 17+ | Simulation |
| 3144 | Vi Trainer | HEALTH_AND_FITNESS | 3.60 | 124 | 100.00 | 5,000 | Free | 0 | Everyone | Health & Fitness |
| 9945 | Ultimate Tennis | SPORTS | 4.30 | 183004 | 100.00 | 10,000,000 | Free | 0 | Everyone | Sports |
| 7926 | Post Bank | FINANCE | 4.50 | 60449 | 100.00 | 1,000,000 | Free | 0 | Everyone | Finance |
| 7927 | The Walking Dead: Our World | GAME | 4.00 | 22435 | 100.00 | 1,000,000 | Free | 0 | Teen | Action |
| 7928 | Stickman Legends: Shadow Wars | GAME | 4.40 | 38419 | 100.00 | 1,000,000 | Paid | $0.99 | Everyone 10+ | Action |
#Sort apps based on the Reviews (Highest to Lowest)
df_apps_clean.sort_values('Reviews', ascending=False).head(20)
| App | Category | Rating | Reviews | Size_MBs | Installs | Type | Price | Content_Rating | Genres | |
|---|---|---|---|---|---|---|---|---|---|---|
| 10805 | SOCIAL | 4.10 | 78158306 | 5.30 | 1,000,000,000 | Free | 0 | Teen | Social | |
| 10785 | WhatsApp Messenger | COMMUNICATION | 4.40 | 69119316 | 3.50 | 1,000,000,000 | Free | 0 | Everyone | Communication |
| 10806 | SOCIAL | 4.50 | 66577313 | 5.30 | 1,000,000,000 | Free | 0 | Teen | Social | |
| 10784 | Messenger – Text and Video Chat for Free | COMMUNICATION | 4.00 | 56642847 | 3.50 | 1,000,000,000 | Free | 0 | Everyone | Communication |
| 10650 | Clash of Clans | GAME | 4.60 | 44891723 | 98.00 | 100,000,000 | Free | 0 | Everyone 10+ | Strategy |
| 10744 | Clean Master- Space Cleaner & Antivirus | TOOLS | 4.70 | 42916526 | 3.40 | 500,000,000 | Free | 0 | Everyone | Tools |
| 10835 | Subway Surfers | GAME | 4.50 | 27722264 | 76.00 | 1,000,000,000 | Free | 0 | Everyone 10+ | Arcade |
| 10828 | YouTube | VIDEO_PLAYERS | 4.30 | 25655305 | 4.65 | 1,000,000,000 | Free | 0 | Teen | Video Players & Editors |
| 10746 | Security Master - Antivirus, VPN, AppLock, Boo... | TOOLS | 4.70 | 24900999 | 3.40 | 500,000,000 | Free | 0 | Everyone | Tools |
| 10584 | Clash Royale | GAME | 4.60 | 23133508 | 97.00 | 100,000,000 | Free | 0 | Everyone 10+ | Strategy |
| 10763 | Candy Crush Saga | GAME | 4.40 | 22426677 | 74.00 | 500,000,000 | Free | 0 | Everyone | Casual |
| 10770 | UC Browser - Fast Download Private & Secure | COMMUNICATION | 4.50 | 17712922 | 40.00 | 500,000,000 | Free | 0 | Teen | Communication |
| 10735 | Snapchat | SOCIAL | 4.00 | 17014787 | 5.30 | 500,000,000 | Free | 0 | Teen | Social |
| 10489 | 360 Security - Free Antivirus, Booster, Cleaner | TOOLS | 4.60 | 16771865 | 3.40 | 100,000,000 | Free | 0 | Everyone | Tools |
| 10731 | My Talking Tom | GAME | 4.50 | 14891223 | 36.00 | 500,000,000 | Free | 0 | Everyone | Casual |
| 10594 | 8 Ball Pool | GAME | 4.50 | 14198297 | 52.00 | 100,000,000 | Free | 0 | Everyone | Sports |
| 10302 | DU Battery Saver - Battery Charger & Battery Life | TOOLS | 4.50 | 13479633 | 14.00 | 100,000,000 | Free | 0 | Everyone | Tools |
| 10354 | BBM - Free Calls & Messages | COMMUNICATION | 4.30 | 12842860 | 3.50 | 100,000,000 | Free | 0 | Everyone | Communication |
| 10549 | Cache Cleaner-DU Speed Booster (booster & clea... | TOOLS | 4.50 | 12759663 | 15.00 | 100,000,000 | Free | 0 | Everyone | Tools |
| 10757 | NEWS_AND_MAGAZINES | 4.30 | 11667403 | 6.30 | 500,000,000 | Free | 0 | Mature 17+ | News & Magazines |
We will use plotly, a commonly used data visualisation library that you can use in combination with or instead of Matplotlib.
#Count how many apps fall under each Content_Rating value
ratings = df_apps_clean["Content_Rating"].value_counts()
ratings
Everyone 6621 Teen 912 Mature 17+ 357 Everyone 10+ 305 Adults only 18+ 3 Unrated 1 Name: Content_Rating, dtype: int64
#Importing the data visualisation libraries
import pandas as pd
import plotly.express as px
#px.pie builds the pie chart: names= supplies the label of each slice and values= its size.
#The labels= argument is not passed because plotly express expects a dict there that renames
#columns for display (for example {"Content_Rating": "Content rating"}), not the slice labels.
fig = px.pie(names=ratings.index, values=ratings.values, title="Content Rating")
#update_traces adjusts how the text is drawn: outside the slices, showing percentage and label.
fig.update_traces(textposition='outside', textinfo='percent+label')
fig.show()
If you’d like to configure other aspects of the chart that you can’t see in the list of parameters, you can call a method called .update_traces(). In plotly lingo, “traces” refer to graphical marks on a figure. Think of “traces” as collections of attributes. Here we update the traces to change how the text is displayed.
#Same data drawn as a donut chart: hole=0.6 cuts out the centre of the pie.
#labels= is omitted because plotly express expects a column-renaming dict there; names= already labels the slices.
fig = px.pie(names=ratings.index, values=ratings.values, title="Content Rating", hole=0.6)
#Draw only the percentage, inside each slice, with a larger font.
fig.update_traces(textposition='inside', textfont_size=15, textinfo='percent')
fig.show()
#Installs data type is not coming up as a number. But rather coming up as an object.
#The reason Python is not recognising our installs as numbers is because of the comma (,) characters in the Installs column.
df_apps_clean.Installs.describe()
count 8199 unique 19 top 1000000 freq 1417 Name: Installs, dtype: object
We can remove the comma (,) character - or any character for that matter - from a DataFrame column using the string accessor’s .str.replace() method. Here we’re saying: “replace the , with an empty string”. This completely removes all the commas in the Installs column. We can then convert our data to a number using pd.to_numeric().
#Strip the thousands separators (,) so that the Installs values contain digits only.
installs_text = df_apps_clean["Installs"].astype(str)
df_apps_clean["Installs"] = installs_text.str.replace(',', "")
#Convert the cleaned strings in the Installs column into numbers.
df_apps_clean["Installs"] = pd.to_numeric(df_apps_clean["Installs"])
#The price is showing as an object rather than a number data type, so we need to change it.
df_apps_clean.Price.describe()
count 8199 unique 73 top 0 freq 7595 Name: Price, dtype: object
#Strip the dollar sign from every entry in the Price column.
#regex=False makes '$' a literal character; read as a regular expression '$' is the
#end-of-string anchor, which is why relying on the default regex setting raises a FutureWarning.
df_apps_clean.Price = df_apps_clean.Price.astype(str).str.replace('$', "", regex=False)
C:\Users\4ben\AppData\Local\Temp/ipykernel_17652/2313303643.py:2: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
#After removing the dollar sign from the entries in the Price column, we then change price column to a number or float data type
df_apps_clean.Price = pd.to_numeric(df_apps_clean.Price)
df_apps_clean.Price.describe()
count 8,199.00 mean 1.04 std 16.85 min 0.00 25% 0.00 50% 0.00 75% 0.00 max 400.00 Name: Price, dtype: float64
df_apps_clean
# We can see that the dollar signs no longer appear in the data entries within the Price column.
| App | Category | Rating | Reviews | Size_MBs | Installs | Type | Price | Content_Rating | Genres | |
|---|---|---|---|---|---|---|---|---|---|---|
| 21 | KBA-EZ Health Guide | MEDICAL | 5.00 | 4 | 25.00 | 1 | Free | 0.00 | Everyone | Medical |
| 28 | Ra Ga Ba | GAME | 5.00 | 2 | 20.00 | 1 | Paid | 1.49 | Everyone | Arcade |
| 47 | Mu.F.O. | GAME | 5.00 | 2 | 16.00 | 1 | Paid | 0.99 | Everyone | Arcade |
| 82 | Brick Breaker BR | GAME | 5.00 | 7 | 19.00 | 5 | Free | 0.00 | Everyone | Arcade |
| 99 | Anatomy & Physiology Vocabulary Exam Review App | MEDICAL | 5.00 | 1 | 4.60 | 5 | Free | 0.00 | Everyone | Medical |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10824 | Google Drive | PRODUCTIVITY | 4.40 | 2731171 | 4.00 | 1000000000 | Free | 0.00 | Everyone | Productivity |
| 10828 | YouTube | VIDEO_PLAYERS | 4.30 | 25655305 | 4.65 | 1000000000 | Free | 0.00 | Teen | Video Players & Editors |
| 10829 | Google Play Movies & TV | VIDEO_PLAYERS | 3.70 | 906384 | 4.65 | 1000000000 | Free | 0.00 | Teen | Video Players & Editors |
| 10831 | Google News | NEWS_AND_MAGAZINES | 3.90 | 877635 | 13.00 | 1000000000 | Free | 0.00 | Teen | News & Magazines |
| 10835 | Subway Surfers | GAME | 4.50 | 27722264 | 76.00 | 1000000000 | Free | 0.00 | Everyone 10+ | Arcade |
8199 rows × 10 columns
df_apps_clean.sort_values('Price', ascending=False).head(20)
| App | Category | Rating | Reviews | Size_MBs | Installs | Type | Price | Content_Rating | Genres | |
|---|---|---|---|---|---|---|---|---|---|---|
| 3946 | I'm Rich - Trump Edition | LIFESTYLE | 3.60 | 275 | 7.30 | 10000 | Paid | 400.00 | Everyone | Lifestyle |
| 2461 | I AM RICH PRO PLUS | FINANCE | 4.00 | 36 | 41.00 | 1000 | Paid | 399.99 | Everyone | Finance |
| 4606 | I Am Rich Premium | FINANCE | 4.10 | 1867 | 4.70 | 50000 | Paid | 399.99 | Everyone | Finance |
| 3145 | I am rich(premium) | FINANCE | 3.50 | 472 | 0.94 | 5000 | Paid | 399.99 | Everyone | Finance |
| 3554 | 💎 I'm rich | LIFESTYLE | 3.80 | 718 | 26.00 | 10000 | Paid | 399.99 | Everyone | Lifestyle |
| 5765 | I am rich | LIFESTYLE | 3.80 | 3547 | 1.80 | 100000 | Paid | 399.99 | Everyone | Lifestyle |
| 1946 | I am rich (Most expensive app) | FINANCE | 4.10 | 129 | 2.70 | 1000 | Paid | 399.99 | Teen | Finance |
| 2775 | I Am Rich Pro | FAMILY | 4.40 | 201 | 2.70 | 5000 | Paid | 399.99 | Everyone | Entertainment |
| 3221 | I am Rich Plus | FAMILY | 4.00 | 856 | 8.70 | 10000 | Paid | 399.99 | Everyone | Entertainment |
| 3114 | I am Rich | FINANCE | 4.30 | 180 | 3.80 | 5000 | Paid | 399.99 | Everyone | Finance |
| 1331 | most expensive app (H) | FAMILY | 4.30 | 6 | 1.50 | 100 | Paid | 399.99 | Everyone | Entertainment |
| 2394 | I am Rich! | FINANCE | 3.80 | 93 | 22.00 | 1000 | Paid | 399.99 | Everyone | Finance |
| 3897 | I Am Rich | FAMILY | 3.60 | 217 | 4.90 | 10000 | Paid | 389.99 | Everyone | Entertainment |
| 2193 | I am extremely Rich | LIFESTYLE | 2.90 | 41 | 2.90 | 1000 | Paid | 379.99 | Everyone | Lifestyle |
| 3856 | I am rich VIP | LIFESTYLE | 3.80 | 411 | 2.60 | 10000 | Paid | 299.99 | Everyone | Lifestyle |
| 2281 | Vargo Anesthesia Mega App | MEDICAL | 4.60 | 92 | 32.00 | 1000 | Paid | 79.99 | Everyone | Medical |
| 1407 | LTC AS Legal | MEDICAL | 4.00 | 6 | 1.30 | 100 | Paid | 39.99 | Everyone | Medical |
| 2629 | I am Rich Person | LIFESTYLE | 4.20 | 134 | 1.80 | 1000 | Paid | 37.99 | Everyone | Lifestyle |
| 2481 | A Manual of Acupuncture | MEDICAL | 3.50 | 214 | 68.00 | 1000 | Paid | 33.99 | Everyone | Medical |
| 4264 | Golfshot Plus: Golf GPS | SPORTS | 4.10 | 3387 | 25.00 | 50000 | Paid | 29.99 | Everyone | Sports |
df_apps_clean.Category.nunique()
#33 unique categories
33
#value_counts() sorts by frequency, so the first 10 entries (positions 0 to 9) are the 10 largest categories.
top10_category = df_apps_clean["Category"].value_counts().head(10)
print(top10_category)
#The index (category names) is printed in the left column and the counts in the right column.
FAMILY 1610 GAME 910 TOOLS 719 FINANCE 302 LIFESTYLE 302 PRODUCTIVITY 301 PERSONALIZATION 298 MEDICAL 292 PHOTOGRAPHY 263 BUSINESS 262 Name: Category, dtype: int64
bar = px.bar(x = top10_category.index, # index = category name
y = top10_category.values)
bar.show()
#Total number of installs per category, ordered from the least to the most installed category.
category_installs = (
    df_apps_clean.groupby('Category')
    .agg({'Installs': 'sum'})
    .sort_values('Installs', ascending=True)
)
category_installs
| Installs | |
|---|---|
| Category | |
| EVENTS | 15949410 |
| BEAUTY | 26916200 |
| PARENTING | 31116110 |
| MEDICAL | 39162676 |
| COMICS | 44931100 |
| LIBRARIES_AND_DEMO | 52083000 |
| AUTO_AND_VEHICLES | 53129800 |
| HOUSE_AND_HOME | 97082000 |
| ART_AND_DESIGN | 114233100 |
| DATING | 140912410 |
| FOOD_AND_DRINK | 211677750 |
| EDUCATION | 352852000 |
| WEATHER | 361096500 |
| FINANCE | 455312400 |
| MAPS_AND_NAVIGATION | 503267560 |
| LIFESTYLE | 503742120 |
| BUSINESS | 692018120 |
| SPORTS | 1096431465 |
| HEALTH_AND_FITNESS | 1134006220 |
| SHOPPING | 1400331540 |
| PERSONALIZATION | 1532352930 |
| BOOKS_AND_REFERENCE | 1665791655 |
| ENTERTAINMENT | 2113660000 |
| NEWS_AND_MAGAZINES | 2369110650 |
| TRAVEL_AND_LOCAL | 2894859300 |
| VIDEO_PLAYERS | 3916897200 |
| FAMILY | 4437579590 |
| PHOTOGRAPHY | 4649143130 |
| SOCIAL | 5487841475 |
| PRODUCTIVITY | 5788070180 |
| TOOLS | 8099724500 |
| COMMUNICATION | 11039241530 |
| GAME | 13858762717 |
#Number of apps for every (Category, Type) pair; as_index=False keeps Category and Type as regular columns.
apps_by_category_and_type = df_apps_clean.groupby(["Category", "Type"], as_index=False)
df_free_vs_paid = apps_by_category_and_type.agg({'App': 'count'})
df_free_vs_paid.head()
| Category | Type | App | |
|---|---|---|---|
| 0 | ART_AND_DESIGN | Free | 58 |
| 1 | ART_AND_DESIGN | Paid | 3 |
| 2 | AUTO_AND_VEHICLES | Free | 72 |
| 3 | AUTO_AND_VEHICLES | Paid | 1 |
| 4 | BEAUTY | Free | 42 |
#Grouped bar chart: for every category the Free and Paid bars are drawn next to each other.
g_bar = px.bar(
    df_free_vs_paid,
    x='Category',
    y='App',
    color='Type',
    barmode='group',
    title='Free vs Paid Apps by Category',
)
#Order the categories by their total app count and use a log scale on the y-axis.
g_bar.update_layout(
    xaxis_title='Category',
    yaxis_title='Number of Apps',
    xaxis={'categoryorder': 'total descending'},
    yaxis={'type': 'log'},
)
g_bar.show()